Zilow Zestimate

In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from random import shuffle
from gc import collect

from bokeh.charts import show
from bokeh.plotting import figure
from bokeh.layouts import gridplot
from bokeh.io import output_notebook
output_notebook()

import warnings

warnings.filterwarnings("ignore")
/home/tales/anaconda3/lib/python3.5/site-packages/bokeh/util/deprecation.py:34: BokehDeprecationWarning: 
The bokeh.charts API has moved to a separate 'bkcharts' package.

This compatibility shim will remain until Bokeh 1.0 is released.
After that, if you want to use this API you will have to install
the bkcharts package explicitly.

  warn(message)
Loading BokehJS ...

The Data

In [2]:
def normalize(column):
    norm_column = (column - column.mean()) / (column.max() - column.min())
    return norm_column
    
def normalize_columns(data, columns):
    df = data.copy()
    for column in columns:
        df[column] = normalize(df[column])
    return df
In [3]:
# data = pd.read_csv("../data/train_complete_2016.csv")
# data.shape

train = pd.read_csv("../data/train.csv", index_col=0)
test = pd.read_csv("../data/test.csv", index_col=0)

# test_target = test["logerror"]
# train_target = train["logerror"]

# del test["logerror"]
# del train["logerror"]
In [4]:
train.head()
Out[4]:
parcelid logerror transactiondate airconditioningtypeid architecturalstyletypeid basementsqft bathroomcnt bedroomcnt buildingclasstypeid buildingqualitytypeid ... numberofstories fireplaceflag structuretaxvaluedollarcnt taxvaluedollarcnt assessmentyear landtaxvaluedollarcnt taxamount taxdelinquencyflag taxdelinquencyyear censustractandblock
66450 12149460 0.0363 2016-08-12 1.0 NaN NaN 3.0 4.0 NaN 4.0 ... NaN NaN 180637.0 722923.0 2015.0 542286.0 8080.91 NaN NaN 6.037300e+13
16762 11962551 0.0218 2016-03-16 1.0 NaN NaN 4.0 3.0 NaN 4.0 ... NaN NaN 182151.0 291440.0 2015.0 109289.0 3690.38 NaN NaN 6.037186e+13
63556 10734901 0.0100 2016-08-04 1.0 NaN NaN 3.0 3.0 NaN 4.0 ... NaN NaN 141380.0 193171.0 2015.0 51791.0 6234.70 NaN NaN 6.037137e+13
36025 11104067 0.0060 2016-05-17 NaN NaN NaN 3.0 4.0 NaN 7.0 ... NaN NaN 163569.0 272612.0 2015.0 109043.0 4848.37 NaN NaN 6.037920e+13
30783 12678699 -0.0111 2016-04-29 NaN NaN NaN 2.0 3.0 NaN 7.0 ... NaN NaN 27108.0 91344.0 2015.0 64236.0 1305.15 NaN NaN 6.037651e+13

5 rows × 60 columns

In [5]:
test.head()
Out[5]:
parcelid logerror transactiondate airconditioningtypeid architecturalstyletypeid basementsqft bathroomcnt bedroomcnt buildingclasstypeid buildingqualitytypeid ... numberofstories fireplaceflag structuretaxvaluedollarcnt taxvaluedollarcnt assessmentyear landtaxvaluedollarcnt taxamount taxdelinquencyflag taxdelinquencyyear censustractandblock
1 14366692 -0.1684 2016-01-01 NaN NaN NaN 3.5 4.0 NaN NaN ... NaN NaN 346458.0 585529.0 2015.0 239071.0 10153.02 NaN NaN NaN
3 12643413 0.0218 2016-01-02 1.0 NaN NaN 2.0 2.0 NaN 4.0 ... NaN NaN 171518.0 244880.0 2015.0 73362.0 3048.74 NaN NaN 6.037296e+13
4 14432541 -0.0050 2016-01-02 NaN NaN NaN 2.5 4.0 NaN NaN ... 2.0 NaN 169574.0 434551.0 2015.0 264977.0 5488.96 NaN NaN 6.059042e+13
5 11509835 -0.2705 2016-01-02 1.0 NaN NaN 4.0 4.0 NaN 1.0 ... NaN NaN 880650.0 2447951.0 2015.0 1567301.0 27126.57 NaN NaN 6.037621e+13
11 11672170 -0.0161 2016-01-03 1.0 NaN NaN 4.0 5.0 NaN 1.0 ... NaN NaN 559040.0 1090127.0 2015.0 531087.0 13428.94 NaN NaN 6.037263e+13

5 rows × 60 columns

In [6]:
numeric_columns = ['basementsqft', 'bathroomcnt', 'bedroomcnt', 
                   'threequarterbathnbr', 'finishedfloor1squarefeet', 
                   'calculatedfinishedsquarefeet', 
                   'finishedsquarefeet6', 'finishedsquarefeet12', 'finishedsquarefeet13', 
                   'finishedsquarefeet15', 'finishedsquarefeet50', 'fireplacecnt', 
                   'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'latitude', 
                   'longitude', 'lotsizesquarefeet', 'poolsizesum', 'roomcnt', 'unitcnt', 
                   'yardbuildingsqft17', 'yardbuildingsqft26', 'yearbuilt', 'taxamount',
                   'structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt']

Attribute correlation

Plotting attribute correlation with the target column.

In [7]:
def plot_correlation(data, column_name, alpha=0.2, sample_ratio=1):
    p = figure(plot_width=300, plot_height=300, title=column_name + " vs " + "logerror", tools=["xwheel_zoom", "xpan"])
    n_sample = int(sample_ratio * len(data))
    data_plot = data.sample(n_sample)
    p.circle(data_plot[column_name], data_plot["logerror"], alpha=alpha)
    return p

def correlations_df(data, column_name):
    corr = data[column_name].corr(data["logerror"])
    sign = '+' if corr >= 0 else '-'
    corr = corr if corr >= 0 else corr * -1
    
    return corr_df.append(pd.DataFrame([{"attribute": column_name, "sign": sign, "corr": corr}]))
    
In [11]:
corr_df = pd.DataFrame()
grid = [[]]

for column_name in numeric_columns: 
    corr_df = correlations_df(train, column_name)
    
    p = plot_correlation(train, column_name, alpha=0.1, sample_ratio=0.25)
    
    if len(grid[-1]) % 3 == 0:
        grid.append([p])
    else:
        grid[-1].append(p)
        
show(gridplot(grid))
W-1002 (EMPTY_LAYOUT): Layout has no children: Row(id='fbdc6411-34e8-4ea1-8311-1e9571018734', ...)
In [12]:
# corr_df.sort_values("corr", ascending=False)

Missing values

Replacing missing values with column mode

In [13]:
def column_median(column):
    return column.median()
In [14]:
data = train.append(test)
In [15]:
sample5 = train.sample(5)
sample5[numeric_columns]
Out[15]:
basementsqft bathroomcnt bedroomcnt threequarterbathnbr finishedfloor1squarefeet calculatedfinishedsquarefeet finishedsquarefeet6 finishedsquarefeet12 finishedsquarefeet13 finishedsquarefeet15 ... lotsizesquarefeet poolsizesum roomcnt unitcnt yardbuildingsqft17 yardbuildingsqft26 yearbuilt taxamount structuretaxvaluedollarcnt landtaxvaluedollarcnt
58080 NaN 2.0 4.0 NaN NaN 1639.0 NaN 1639.0 NaN NaN ... 6423.0 NaN 0.0 1.0 NaN NaN 1960.0 1549.34 51175.0 66480.0
5821 NaN 3.0 4.0 NaN NaN 2215.0 NaN 2215.0 NaN NaN ... 4051.0 NaN 0.0 1.0 NaN NaN 2005.0 6544.83 352603.0 182260.0
55621 NaN 3.0 4.0 NaN NaN 2023.0 NaN 2023.0 NaN NaN ... 8826.0 NaN 0.0 1.0 NaN NaN 1981.0 3904.48 232028.0 77386.0
34626 NaN 3.0 3.0 NaN NaN 1850.0 NaN 1850.0 NaN NaN ... 4675.0 NaN 0.0 1.0 NaN NaN 1921.0 16922.79 240373.0 1149612.0
79578 NaN 2.5 3.0 1.0 NaN 1890.0 NaN 1890.0 NaN NaN ... 6320.0 NaN 0.0 NaN NaN NaN 1994.0 5604.22 209055.0 202531.0

5 rows × 27 columns

In [16]:
for column_name in numeric_columns:
    mode = column_median(data[column_name])
    data[column_name] = data[column_name].fillna(mode)
In [17]:
sample5 = data.loc[sample5.index.tolist()]
sample5[numeric_columns]
Out[17]:
basementsqft bathroomcnt bedroomcnt threequarterbathnbr finishedfloor1squarefeet calculatedfinishedsquarefeet finishedsquarefeet6 finishedsquarefeet12 finishedsquarefeet13 finishedsquarefeet15 ... lotsizesquarefeet poolsizesum roomcnt unitcnt yardbuildingsqft17 yardbuildingsqft26 yearbuilt taxamount structuretaxvaluedollarcnt landtaxvaluedollarcnt
58080 616.0 2.0 4.0 1.0 1244.0 1639.0 2028.0 1639.0 1440.0 2104.5 ... 6423.0 500.0 0.0 1.0 259.5 159.0 1960.0 1549.34 51175.0 66480.0
5821 616.0 3.0 4.0 1.0 1244.0 2215.0 2028.0 2215.0 1440.0 2104.5 ... 4051.0 500.0 0.0 1.0 259.5 159.0 2005.0 6544.83 352603.0 182260.0
55621 616.0 3.0 4.0 1.0 1244.0 2023.0 2028.0 2023.0 1440.0 2104.5 ... 8826.0 500.0 0.0 1.0 259.5 159.0 1981.0 3904.48 232028.0 77386.0
34626 616.0 3.0 3.0 1.0 1244.0 1850.0 2028.0 1850.0 1440.0 2104.5 ... 4675.0 500.0 0.0 1.0 259.5 159.0 1921.0 16922.79 240373.0 1149612.0
79578 616.0 2.5 3.0 1.0 1244.0 1890.0 2028.0 1890.0 1440.0 2104.5 ... 6320.0 500.0 0.0 1.0 259.5 159.0 1994.0 5604.22 209055.0 202531.0

5 rows × 27 columns

In [19]:
for column in numeric_columns:
    #columns must not contains nan
    assert (False == (True in data[column].isnull().tolist()))
Normalizing the data
In [20]:
norm_data = normalize_columns(data, numeric_columns)

sample5 = norm_data.loc[sample5.index.tolist()]
sample5[numeric_columns]
Out[20]:
basementsqft bathroomcnt bedroomcnt threequarterbathnbr finishedfloor1squarefeet calculatedfinishedsquarefeet finishedsquarefeet6 finishedsquarefeet12 finishedsquarefeet13 finishedsquarefeet15 ... lotsizesquarefeet poolsizesum roomcnt unitcnt yardbuildingsqft17 yardbuildingsqft26 yearbuilt taxamount structuretaxvaluedollarcnt landtaxvaluedollarcnt
58080 -0.000032 -0.013891 0.06052 -0.000385 -0.001035 -0.005766 -0.000183 -0.004667 0.000024 -0.000488 ... -0.002885 -0.000123 -0.081655 -0.0005 -0.000556 -0.000119 -0.065798 -0.013750 -0.012910 -0.008627
5821 -0.000032 0.036109 0.06052 -0.000385 -0.001035 0.019565 -0.000183 0.024117 0.000024 -0.000488 ... -0.003225 -0.000123 -0.081655 -0.0005 -0.000556 -0.000119 0.280356 0.001769 0.017390 -0.003901
55621 -0.000032 0.036109 0.06052 -0.000385 -0.001035 0.011121 -0.000183 0.014522 0.000024 -0.000488 ... -0.002540 -0.000123 -0.081655 -0.0005 -0.000556 -0.000119 0.095740 -0.006434 0.005269 -0.008181
34626 -0.000032 0.036109 -0.00198 -0.000385 -0.001035 0.003513 -0.000183 0.005877 0.000024 -0.000488 ... -0.003135 -0.000123 -0.081655 -0.0005 -0.000556 -0.000119 -0.365798 0.034010 0.006108 0.035583
79578 -0.000032 0.011109 -0.00198 -0.000385 -0.001035 0.005272 -0.000183 0.007876 0.000024 -0.000488 ... -0.002899 -0.000123 -0.081655 -0.0005 -0.000556 -0.000119 0.195740 -0.001153 0.002960 -0.003073

5 rows × 27 columns

In [21]:
data = norm_data
train = data.iloc[train.index.tolist()]
test = data.iloc[test.index.tolist()]

data = None
norm_data = None
collect()
Out[21]:
2633

Modeling v1 - Raw Attributes

In [22]:
from sklearn.linear_model import SGDRegressor, Ridge
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest
from numpy import median
In [23]:
def scorer(estimator, X, y):
    pred = estimator.predict(X)
    return mean_absolute_error(pred, y)

def scores(model, train, train_target, test, test_target, k, scorer=scorer):
    trained_model = model.fit(train, train_target)
    
    test_score = scorer(trained_model, test, test_target)
    train_score = scorer(trained_model, train, train_target)

    cv_scores = cross_val_score(model, train, train_target, cv=k, scoring=scorer)
    
    return {"train": train_score, "test": test_score, "validation": median(cv_scores)}
Use columns
In [24]:
skb = SelectKBest(k=10)
fit = skb.fit(train[numeric_columns], train["logerror"])

use_columns = train[numeric_columns].columns[fit.get_support()].tolist()
use_columns
Out[24]:
['basementsqft',
 'bathroomcnt',
 'threequarterbathnbr',
 'calculatedfinishedsquarefeet',
 'finishedsquarefeet12',
 'finishedsquarefeet15',
 'yearbuilt',
 'taxamount',
 'structuretaxvaluedollarcnt',
 'landtaxvaluedollarcnt']
In [25]:
results = []
k = 5
Train and test partitioning
In [26]:
# train, test = partition(data[use_columns + ["logerror"]], train_proportion=0.7)
In [27]:
collect()
Out[27]:
7

Linear Regression

Stochastic Gradient Descent Regressor
In [28]:
model = SGDRegressor(alpha=0.001, max_iter=1000)
scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
scores_dict
Out[28]:
{'test': 0.068941943347990406,
 'train': 0.069134767281800527,
 'validation': 0.069087687956317895}
In [29]:
results.append({"model": "linear SGDRegressor", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha0.001, max_iter=1000"})
In [30]:
model = SGDRegressor(alpha=0.0001, max_iter=2000)
scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
scores_dict
Out[30]:
{'test': 0.069129518777583623,
 'train': 0.069304754444278541,
 'validation': 0.069216484252882432}
In [31]:
results.append({"model": "linear SGDRegressor", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha0.0001, max_iter=2000"})
In [32]:
model = SGDRegressor(alpha=0.0001, max_iter=1000)
scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
scores_dict
Out[32]:
{'test': 0.069004117788247554,
 'train': 0.069189205846361082,
 'validation': 0.069195250276674072}
In [33]:
results.append({"model": "linear SGDRegressor", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha0.0001, max_iter=1000"})
Ridge
In [34]:
model = Ridge()
scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
scores_dict
Out[34]:
{'test': 0.069102667243088237,
 'train': 0.069251340103078887,
 'validation': 0.069236407107031175}
In [35]:
results.append({"model": "linear Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"]})
In [36]:
model = Ridge(alpha=2.0)
scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
scores_dict
Out[36]:
{'test': 0.069086030610211263,
 'train': 0.069242821053735332,
 'validation': 0.069215236326736984}
In [37]:
results.append({"model": "linear Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha=2"})
In [38]:
model = Ridge(alpha=10.0)
scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
scores_dict
Out[38]:
{'test': 0.069036535323963144,
 'train': 0.069217453224532974,
 'validation': 0.069193338549299557}
In [39]:
results.append({"model": "linear Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha=10"})
In [40]:
model = Ridge(alpha=0.5)
scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
scores_dict
Out[40]:
{'test': 0.069118810388897028,
 'train': 0.069261987467230945,
 'validation': 0.069260194143717618}
In [41]:
results.append({"model": "linear Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha=0.5"})
In [42]:
model = Ridge(alpha=0.2)
scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
scores_dict
Out[42]:
{'test': 0.069136805244867747,
 'train': 0.06927665989357143,
 'validation': 0.069298657464719485}
In [43]:
results.append({"model": "linear Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "alpha=0.2"})

Polynomial Regression

In [44]:
from sklearn.preprocessing import PolynomialFeatures

def polynomial_features(df, degree=2):
    poly = PolynomialFeatures(degree=degree)
    polinomial_data = pd.DataFrame(poly.fit_transform(df))
    polinomial_data.columns = poly.get_feature_names()
    return polinomial_data
In [45]:
poly_test = polynomial_features(test[use_columns], degree=3)
poly_test.head()

poly_train = polynomial_features(train[use_columns], degree=3)
poly_train.head()
Out[45]:
1 x0 x1 x2 x3 x4 x5 x6 x7 x8 ... x7^3 x7^2 x8 x7^2 x9 x7 x8^2 x7 x8 x9 x7 x9^2 x8^3 x8^2 x9 x8 x9^2 x9^3
0 1.0 -0.000032 0.036109 -0.000385 0.048414 0.056899 -0.000488 0.295740 0.027664 0.020445 ... 2.117163e-05 1.564709e-05 1.036118e-06 1.156413e-05 7.657525e-07 5.070652e-08 8.546582e-06 5.659367e-07 3.747514e-08 2.481525e-09
1 1.0 -0.000032 0.036109 -0.000385 0.027877 0.033562 -0.000488 -0.142721 0.024678 0.005727 ... 1.502915e-05 3.487821e-06 1.534603e-05 8.094198e-07 3.561359e-06 1.566959e-05 1.878423e-07 8.264859e-07 3.636448e-06 1.599998e-05
2 1.0 -0.000032 -0.013891 -0.000385 -0.012363 -0.012163 -0.000488 0.134202 -0.007999 -0.003422 ... -5.117662e-07 -2.189170e-07 -6.210778e-07 -9.364564e-08 -2.656770e-07 -7.537379e-07 -4.005858e-08 -1.136480e-07 -3.224247e-07 -9.147339e-07
3 1.0 -0.000032 -0.013891 -0.000385 -0.024061 -0.025456 -0.000488 -0.111952 -0.015789 -0.015375 ... -3.936126e-06 -3.832972e-06 -2.592365e-06 -3.732522e-06 -2.524427e-06 -1.707353e-06 -3.634704e-06 -2.458270e-06 -1.662609e-06 -1.124477e-06
4 1.0 -0.000032 -0.013891 -0.000385 -0.008185 -0.007416 -0.000488 -0.342721 0.004212 0.002405 ... 7.472946e-08 4.266833e-08 7.849464e-08 2.436236e-08 4.481814e-08 8.244953e-08 1.391019e-08 2.558984e-08 4.707626e-08 8.660368e-08

5 rows × 286 columns

In [46]:
skb = SelectKBest(k=10)
fit = skb.fit(poly_train, train["logerror"])

use_poly_columns = poly_train.columns[fit.get_support()].tolist()
poly_train = poly_train[use_poly_columns]
print(poly_train.columns.tolist())
poly_train.head()
['x0 x3^2', 'x0 x3 x4', 'x0 x3 x7', 'x0 x3 x8', 'x0 x3 x9', 'x0 x4^2', 'x0 x4 x7', 'x0 x4 x8', 'x0 x4 x9', 'x0 x7 x9']
Out[46]:
x0 x3^2 x0 x3 x4 x0 x3 x7 x0 x3 x8 x0 x3 x9 x0 x4^2 x0 x4 x7 x0 x4 x8 x0 x4 x9 x0 x7 x9
0 -7.443502e-08 -8.748043e-08 -4.253278e-08 -3.143425e-08 -2.081510e-09 -1.028122e-07 -4.998703e-08 -3.694339e-08 -2.446314e-09 -1.189392e-09
1 -2.467834e-08 -2.971130e-08 -2.184673e-08 -5.069979e-09 -2.230736e-08 -3.577071e-08 -2.630221e-08 -6.103964e-09 -2.685678e-08 -1.974780e-08
2 -4.853455e-09 -4.775083e-09 -3.140266e-09 -1.343304e-09 -3.811016e-09 -4.697977e-09 -3.089558e-09 -1.321613e-09 -3.749477e-09 -2.465790e-09
3 -1.838418e-08 -1.945015e-08 -1.206412e-08 -1.174795e-08 -7.945526e-09 -2.057792e-08 -1.276363e-08 -1.242913e-08 -8.406230e-09 -5.214034e-09
4 -2.127363e-09 -1.927440e-09 1.094801e-09 6.250990e-10 1.149961e-09 -1.746306e-09 9.919151e-10 5.663544e-10 1.041892e-09 -5.918025e-10
SGD Regressor
In [ ]:
 
In [47]:
model = SGDRegressor(max_iter=2000)
scores_dict = scores(model, poly_train, train["logerror"], 
                            poly_test[poly_train.columns.tolist()],  test["logerror"], k, scorer=scorer)
scores_dict
Out[47]:
{'test': 0.069216629520002904,
 'train': 0.069405963271598953,
 'validation': 0.069464521862867404}
In [48]:
results.append({"model": "Polynomial SGD Regressor", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"],
                "score_cv": scores_dict["validation"], 
                "tags": "degree=4"})
Ridge
In [49]:
model = Ridge()
scores_dict = scores(model, poly_train, train["logerror"], 
                            poly_test[poly_train.columns.tolist()],  test["logerror"], k, scorer=scorer)
scores_dict
Out[49]:
{'test': 0.069185362705891554,
 'train': 0.069376751439138476,
 'validation': 0.069362720718971885}
In [50]:
results.append({"model": "Polynomial Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"],
                "score_cv": scores_dict["validation"], 
                "tags": "degree=4"})
In [51]:
model = Ridge(alpha=0.5)
scores_dict = scores(model, poly_train, train["logerror"], 
                            poly_test[poly_train.columns.tolist()],  test["logerror"], k, scorer=scorer)
scores_dict
Out[51]:
{'test': 0.069185342531298724,
 'train': 0.069376568422738413,
 'validation': 0.069362734787348349}
In [52]:
results.append({"model": "Polynomial Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "degree=4, alpha=0.5"})
In [53]:
model = Ridge(alpha=10)
scores_dict = scores(model, poly_train, train["logerror"], 
                            poly_test[poly_train.columns.tolist()],  test["logerror"], k, scorer=scorer)
scores_dict
Out[53]:
{'test': 0.069185381225803394,
 'train': 0.069376919449278929,
 'validation': 0.069362707808782537}
In [54]:
results.append({"model": "Polynomial Ridge", 
                "score_test": scores_dict["test"],
                "score_train": scores_dict["train"], 
                "score_cv": scores_dict["validation"], 
                "tags": "degree=4, alpha=10.0"})
In [ ]:
 
In [ ]:
 
In [55]:
results_df = pd.DataFrame(results)
results_df.sort_values("score_cv")
Out[55]:
model score_cv score_test score_train tags
0 linear SGDRegressor 0.069088 0.068942 0.069135 alpha0.001, max_iter=1000
5 linear Ridge 0.069193 0.069037 0.069217 alpha=10
2 linear SGDRegressor 0.069195 0.069004 0.069189 alpha0.0001, max_iter=1000
4 linear Ridge 0.069215 0.069086 0.069243 alpha=2
1 linear SGDRegressor 0.069216 0.069130 0.069305 alpha0.0001, max_iter=2000
3 linear Ridge 0.069236 0.069103 0.069251 NaN
6 linear Ridge 0.069260 0.069119 0.069262 alpha=0.5
7 linear Ridge 0.069299 0.069137 0.069277 alpha=0.2
11 Polynomial Ridge 0.069363 0.069185 0.069377 degree=4, alpha=10.0
9 Polynomial Ridge 0.069363 0.069185 0.069377 degree=4
10 Polynomial Ridge 0.069363 0.069185 0.069377 degree=4, alpha=0.5
8 Polynomial SGD Regressor 0.069465 0.069217 0.069406 degree=4
In [56]:
results_df.to_csv("results_df2.csv", index=False)
In [ ]:
 
In [67]:
mlp_results = []
Multi Layer Perceptron Regressor
In [68]:
# [int(len(train[use_columns].columns)) * 1.5] * 10
In [69]:
for hidden_layer_sizes in [10,50,100,200,300,400]:
    model = MLPRegressor(hidden_layer_sizes=[int(len(train[use_columns].columns) * 1.5)] * hidden_layer_sizes)
    print("hidden_layer_sizes:", hidden_layer_sizes)
    scores_dict = scores(model, train[use_columns], train["logerror"], 
                            test[use_columns],  test["logerror"], k, scorer=scorer)
    mlp_results.append({"model": "linear MLP", "score_test": scores_dict["test"], "score_train": scores_dict["train"], "score_cv": scores_dict["validation"], "tags": "hidden_layer_sizes=" + str(hidden_layer_sizes)})
    print(pd.DataFrame(mlp_results))
hidden_layer_sizes: 10
        model  score_cv  score_test  score_train                   tags
0  linear MLP  0.069941     0.06882     0.069079  hidden_layer_sizes=10
hidden_layer_sizes: 50
        model  score_cv  score_test  score_train                   tags
0  linear MLP  0.069941    0.068820     0.069079  hidden_layer_sizes=10
1  linear MLP  0.069253    0.069231     0.069419  hidden_layer_sizes=50
hidden_layer_sizes: 100
        model  score_cv  score_test  score_train                    tags
0  linear MLP  0.069941    0.068820     0.069079   hidden_layer_sizes=10
1  linear MLP  0.069253    0.069231     0.069419   hidden_layer_sizes=50
2  linear MLP  0.070052    0.068913     0.069126  hidden_layer_sizes=100
hidden_layer_sizes: 200
        model  score_cv  score_test  score_train                    tags
0  linear MLP  0.069941    0.068820     0.069079   hidden_layer_sizes=10
1  linear MLP  0.069253    0.069231     0.069419   hidden_layer_sizes=50
2  linear MLP  0.070052    0.068913     0.069126  hidden_layer_sizes=100
3  linear MLP  0.069571    0.068958     0.069204  hidden_layer_sizes=200
hidden_layer_sizes: 300
        model  score_cv  score_test  score_train                    tags
0  linear MLP  0.069941    0.068820     0.069079   hidden_layer_sizes=10
1  linear MLP  0.069253    0.069231     0.069419   hidden_layer_sizes=50
2  linear MLP  0.070052    0.068913     0.069126  hidden_layer_sizes=100
3  linear MLP  0.069571    0.068958     0.069204  hidden_layer_sizes=200
4  linear MLP  0.069356    0.070372     0.070499  hidden_layer_sizes=300
hidden_layer_sizes: 400
        model  score_cv  score_test  score_train                    tags
0  linear MLP  0.069941    0.068820     0.069079   hidden_layer_sizes=10
1  linear MLP  0.069253    0.069231     0.069419   hidden_layer_sizes=50
2  linear MLP  0.070052    0.068913     0.069126  hidden_layer_sizes=100
3  linear MLP  0.069571    0.068958     0.069204  hidden_layer_sizes=200
4  linear MLP  0.069356    0.070372     0.070499  hidden_layer_sizes=300
5  linear MLP  0.069638    0.069717     0.070022  hidden_layer_sizes=400
In [ ]:
 
In [ ]:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
In [70]:
for hidden_layer_sizes in [10,50,100,200,300,400]:
    model = MLPRegressor(hidden_layer_sizes=[int(len(poly_train.columns) * 1)] * hidden_layer_sizes)
    print("hidden_layer_sizes:", hidden_layer_sizes)
    scores_dict = scores(model, poly_train, train["logerror"], 
                            poly_test[poly_train.columns.tolist()],  test["logerror"], k, scorer=scorer)
    mlp_results.append({"model": "polynomial MLP", "score_test": scores_dict["test"], "score_train": scores_dict["train"], "score_cv": scores_dict["validation"], "tags": "hidden_layer_sizes=" + str(hidden_layer_sizes)})
    print(pd.DataFrame(mlp_results))
hidden_layer_sizes: 10
            model  score_cv  score_test  score_train                    tags
0      linear MLP  0.069941    0.068820     0.069079   hidden_layer_sizes=10
1      linear MLP  0.069253    0.069231     0.069419   hidden_layer_sizes=50
2      linear MLP  0.070052    0.068913     0.069126  hidden_layer_sizes=100
3      linear MLP  0.069571    0.068958     0.069204  hidden_layer_sizes=200
4      linear MLP  0.069356    0.070372     0.070499  hidden_layer_sizes=300
5      linear MLP  0.069638    0.069717     0.070022  hidden_layer_sizes=400
6  polynomial MLP  0.069163    0.069374     0.069553   hidden_layer_sizes=10
hidden_layer_sizes: 50
            model  score_cv  score_test  score_train                    tags
0      linear MLP  0.069941    0.068820     0.069079   hidden_layer_sizes=10
1      linear MLP  0.069253    0.069231     0.069419   hidden_layer_sizes=50
2      linear MLP  0.070052    0.068913     0.069126  hidden_layer_sizes=100
3      linear MLP  0.069571    0.068958     0.069204  hidden_layer_sizes=200
4      linear MLP  0.069356    0.070372     0.070499  hidden_layer_sizes=300
5      linear MLP  0.069638    0.069717     0.070022  hidden_layer_sizes=400
6  polynomial MLP  0.069163    0.069374     0.069553   hidden_layer_sizes=10
7  polynomial MLP  0.069153    0.068882     0.069099   hidden_layer_sizes=50
hidden_layer_sizes: 100
            model  score_cv  score_test  score_train                    tags
0      linear MLP  0.069941    0.068820     0.069079   hidden_layer_sizes=10
1      linear MLP  0.069253    0.069231     0.069419   hidden_layer_sizes=50
2      linear MLP  0.070052    0.068913     0.069126  hidden_layer_sizes=100
3      linear MLP  0.069571    0.068958     0.069204  hidden_layer_sizes=200
4      linear MLP  0.069356    0.070372     0.070499  hidden_layer_sizes=300
5      linear MLP  0.069638    0.069717     0.070022  hidden_layer_sizes=400
6  polynomial MLP  0.069163    0.069374     0.069553   hidden_layer_sizes=10
7  polynomial MLP  0.069153    0.068882     0.069099   hidden_layer_sizes=50
8  polynomial MLP  0.069727    0.069243     0.069430  hidden_layer_sizes=100
hidden_layer_sizes: 200
            model  score_cv  score_test  score_train                    tags
0      linear MLP  0.069941    0.068820     0.069079   hidden_layer_sizes=10
1      linear MLP  0.069253    0.069231     0.069419   hidden_layer_sizes=50
2      linear MLP  0.070052    0.068913     0.069126  hidden_layer_sizes=100
3      linear MLP  0.069571    0.068958     0.069204  hidden_layer_sizes=200
4      linear MLP  0.069356    0.070372     0.070499  hidden_layer_sizes=300
5      linear MLP  0.069638    0.069717     0.070022  hidden_layer_sizes=400
6  polynomial MLP  0.069163    0.069374     0.069553   hidden_layer_sizes=10
7  polynomial MLP  0.069153    0.068882     0.069099   hidden_layer_sizes=50
8  polynomial MLP  0.069727    0.069243     0.069430  hidden_layer_sizes=100
9  polynomial MLP  0.069410    0.068862     0.069084  hidden_layer_sizes=200
hidden_layer_sizes: 300
             model  score_cv  score_test  score_train                    tags
0       linear MLP  0.069941    0.068820     0.069079   hidden_layer_sizes=10
1       linear MLP  0.069253    0.069231     0.069419   hidden_layer_sizes=50
2       linear MLP  0.070052    0.068913     0.069126  hidden_layer_sizes=100
3       linear MLP  0.069571    0.068958     0.069204  hidden_layer_sizes=200
4       linear MLP  0.069356    0.070372     0.070499  hidden_layer_sizes=300
5       linear MLP  0.069638    0.069717     0.070022  hidden_layer_sizes=400
6   polynomial MLP  0.069163    0.069374     0.069553   hidden_layer_sizes=10
7   polynomial MLP  0.069153    0.068882     0.069099   hidden_layer_sizes=50
8   polynomial MLP  0.069727    0.069243     0.069430  hidden_layer_sizes=100
9   polynomial MLP  0.069410    0.068862     0.069084  hidden_layer_sizes=200
10  polynomial MLP  0.069492    0.068963     0.069210  hidden_layer_sizes=300
hidden_layer_sizes: 400
             model  score_cv  score_test  score_train                    tags
0       linear MLP  0.069941    0.068820     0.069079   hidden_layer_sizes=10
1       linear MLP  0.069253    0.069231     0.069419   hidden_layer_sizes=50
2       linear MLP  0.070052    0.068913     0.069126  hidden_layer_sizes=100
3       linear MLP  0.069571    0.068958     0.069204  hidden_layer_sizes=200
4       linear MLP  0.069356    0.070372     0.070499  hidden_layer_sizes=300
5       linear MLP  0.069638    0.069717     0.070022  hidden_layer_sizes=400
6   polynomial MLP  0.069163    0.069374     0.069553   hidden_layer_sizes=10
7   polynomial MLP  0.069153    0.068882     0.069099   hidden_layer_sizes=50
8   polynomial MLP  0.069727    0.069243     0.069430  hidden_layer_sizes=100
9   polynomial MLP  0.069410    0.068862     0.069084  hidden_layer_sizes=200
10  polynomial MLP  0.069492    0.068963     0.069210  hidden_layer_sizes=300
11  polynomial MLP  0.069665    0.070081     0.070222  hidden_layer_sizes=400
In [ ]:
 
In [71]:
mlp_results_df = pd.DataFrame(mlp_results).sort_values("score_cv")
In [ ]:
 
In [72]:
mlp_results_df["hidden_layer_sizes"] = mlp_results_df["tags"].apply(lambda value: int(value.split("=")[1]))
mlp_results_df = mlp_results_df.sort_values("hidden_layer_sizes")
mlp_results_df.sort_values("score_cv")
Out[72]:
model score_cv score_test score_train tags hidden_layer_sizes
7 polynomial MLP 0.069153 0.068882 0.069099 hidden_layer_sizes=50 50
6 polynomial MLP 0.069163 0.069374 0.069553 hidden_layer_sizes=10 10
1 linear MLP 0.069253 0.069231 0.069419 hidden_layer_sizes=50 50
4 linear MLP 0.069356 0.070372 0.070499 hidden_layer_sizes=300 300
9 polynomial MLP 0.069410 0.068862 0.069084 hidden_layer_sizes=200 200
10 polynomial MLP 0.069492 0.068963 0.069210 hidden_layer_sizes=300 300
3 linear MLP 0.069571 0.068958 0.069204 hidden_layer_sizes=200 200
5 linear MLP 0.069638 0.069717 0.070022 hidden_layer_sizes=400 400
11 polynomial MLP 0.069665 0.070081 0.070222 hidden_layer_sizes=400 400
8 polynomial MLP 0.069727 0.069243 0.069430 hidden_layer_sizes=100 100
0 linear MLP 0.069941 0.068820 0.069079 hidden_layer_sizes=10 10
2 linear MLP 0.070052 0.068913 0.069126 hidden_layer_sizes=100 100
In [73]:
mlp_results_df.to_csv("mlp_results_df2.csv", index=False)
In [78]:
from bokeh.plotting import figure
from bokeh.io import output_notebook, show
from bokeh.layouts import gridplot, row
output_notebook()
Loading BokehJS ...
In [104]:
poli_mlp = mlp_results_df[mlp_results_df["model"] == "polynomial MLP"]
linear_mlp = mlp_results_df[mlp_results_df["model"] == "linear MLP"]
In [127]:
results_df.groupby("model")["score_cv", "score_test", "score_train"].apply(median)
results_df[results_df["model"].str.contains("SGD")]["score_train"].median()
Out[127]:
0.06924698014531981
In [97]:
# poli_mlp = poli_mlp.append(pd.DataFrame([{"model": "Polynomial SGD Regressor", "score_cv": 0.069465, "score_test": 0.069217, "score_train": 0.069406, "tags": "SGD", "hidden_layer_sizes": 0}]))
# linear_mlp = linear_mlp.append(pd.DataFrame([{"model": "Linear SGD Regressor", "score_cv": 0.069088, "score_test": 0.068942, "score_train": 0.069135, "tags": "SGD", "hidden_layer_sizes": 0}]))
In [98]:
# poli_mlp = poli_mlp.sort_values("hidden_layer_sizes")
# linear_mlp = linear_mlp.sort_values("hidden_layer_sizes")
In [138]:
def plot_mlp_scores(mlp_data, range_y, title, sgd_data=None):
    p = figure(width=400, height=400, tools=["save", "xpan", "xwheel_zoom", "reset"], 
               x_axis_label = "n_layers", y_axis_label = "score", title=title, y_range=range_y)
    
    p.line(x=mlp_data["hidden_layer_sizes"], y=mlp_data["score_train"], 
       line_width=2, color="red", legend="score_train")
    
    p.line(x=mlp_data["hidden_layer_sizes"], y=mlp_data["score_cv"], 
       line_width=2, color="green", legend="score_cv")
    
#     p.line(x=data["hidden_layer_sizes"], y=data["score_test"], 
#        line_width=2, color="blue", legend="score_test")

    if not sgd_data is None:
        p.circle(x=0, y=sgd_data[sgd_data["model"].str.contains("SGD")]["score_train"].median(), color="red")
        p.circle(x=0, y=sgd_data[sgd_data["model"].str.contains("SGD")]["score_cv"].median(), color="green")
    
    return p
In [ ]:
 
In [156]:
range_y = (0.066, 0.0754)

p1 = plot_mlp_scores(linear_mlp, range_y, title="Linear MLP", sgd_data=results_df[results_df["model"].str.contains("linear")])
p2 = plot_mlp_scores(poli_mlp, range_y, title="Polynomial MLP", sgd_data=results_df[results_df["model"].str.contains("Polyn")])
grid = gridplot([[p1, p2]])
show(grid)